This notebook is intended to guide users through some of the major portions of analysis completed for clustering of larval zebrafish retinal ganglion cells. Most of the analysis was run using functionalites within the R pacakge Seurat (Satija et al.,Nature Biotechnology, 2015), an actively maintained set of tools for scRNA-seq analysis (https://satijalab.org/seurat/). utilFxns.R and plottingFxns.R are scripts containing custom scripts used for analysis and generating figures. These scripts are avaiable in the utils folder at https://github.com/shekharlab/ZebrafishRGC.
The first step is to load necessary packages and libraries.
library(Seurat)
library(dplyr)
library(reshape2)
library(ggplot2)
library(cowplot)
library(cluster)
source("utils/utilFxns.R")
source("utils/plottingFxns.R")
Next, we create the initial Seurat object by loading the Count matrix. Count matrices are currently available at https://drive.google.com/drive/folders/1baRKtDkD4d8-6tG8P9v8VcjtUDpWeq5m?usp=sharing, but will later be avaiable from the Gene Expression Omnibus. The Count matrix is subsetted to remove a faulty sample and then used to create a Seurat object.
# Load larval zebrafish count matrix
Count.mat <- readRDS("../CountMatrices/ConsolidatedCounts_Zfish_013020.rds")
Count.mat <- readRDS("../CountMatrices/ConsolidatedCounts_LarvaZFish_02032020")
# Remove ZfishRGC17 due to wetting sample failure
cells.remove = grep("ZfishRGC17_", colnames(Count.mat), value = TRUE)
Count.mat = Count.mat[, setdiff(colnames(Count.mat), cells.remove)]
# Create Seurat object, removing genes that are expressed in
# fewer than 25 cells and removing cells that have fewer than
# 450 features
larva <- CreateSeuratObject(counts = Count.mat, project = "larvaRGC",
min.cells = 25, min.features = 450)
Check quality metrics of the data, including RNA counts and mitochondrial scores.
larva[["percent.mt"]] <- PercentageFeatureSet(larva, pattern = "^MT-")
larva[["percent.rps"]] <- PercentageFeatureSet(larva, pattern = "^RPS")
larva[["percent.rpl"]] <- PercentageFeatureSet(larva, pattern = "^RPL")
larva[["percent.rp"]] <- larva[["percent.rps"]] + larva[["percent.rpl"]]
# Create Violin Plots of RNA counts, mitochondrial
# percentages, and ribosomal percentages
VlnPlot(larva, features = "nCount_RNA", pt.size = 0.3)
VlnPlot(larva, features = "percent.mt", pt.size = 0.3)
VlnPlot(larva, features = "percent.rp", pt.size = 0.3)
Set batch information.
# Change the order of factor
larva@meta.data$orig.ident = factor(larva@meta.data$orig.ident,
levels = paste0("ZfishRGC", c(18:20)))
# Set the batch information in meta.data
batchname = as.character(larva@meta.data$orig.ident)
batchid = rep("Batch0", length(batchname))
batchid[grep("ZfishRGC18", batchname)] = "Batch1"
batchid[grep("ZfishRGC19", batchname)] = "Batch2"
batchid[grep("ZfishRGC20", batchname)] = "Batch3"
larva@meta.data$batch = factor(batchid)
table(larva@meta.data$orig.ident, larva@meta.data$batch)
##
## Batch1 Batch2 Batch3
## ZfishRGC18 4152 0 0
## ZfishRGC19 0 3889 0
## ZfishRGC20 0 0 4657
Using functionalities within Seurat, cluster the dataset. The data is first log-normalized and then all genes are scaled. The top 2000 variable genes are then detected and used to run PCA. The top 30 principle components were selected using an elbow plot to calculate cluster assignments.
# Log normalize the data, identify the top 2000 variable
# features, and scale all genes
larva <- NormalizeData(larva, normalization.method = "LogNormalize",
scale.factor = 10000)
all.genes <- rownames(larva)
larva <- ScaleData(larva, features = all.genes)
# Identify and run PCA on variable features and visualize the
# dimensionality of the dataset using an elbow plot
larva <- FindVariableFeatures(larva, selection.method = "vst",
nfeatures = 2000)
larva <- RunPCA(larva, features = VariableFeatures(object = larva))
ElbowPlot(larva, ndims = 50)
# Find nearest neighbors using 30 PCs and cluster the cells.
# Visualize using tSNE and UMAP
larva <- FindNeighbors(larva, dims = 1:30)
larva <- FindClusters(larva, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 12698
## Number of edges: 473778
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9216
## Number of communities: 25
## Elapsed time: 1 seconds
larva <- RunTSNE(larva, dims = 1:30)
larva <- RunUMAP(larva, dims = 1:30)
# Visualize clusters
DimPlot(larva, reduction = "tsne")
DimPlot(larva, reduction = "umap")
Check for batch effects within the data by grouping the tSNE plot by batch.
DimPlot(larva, reduction = "tsne", group.by = "orig.ident", cells = sample(colnames(larva)))
Investigate expression of canonical RGC markers to check for contaminant cell types.
VlnPlot(larva, "nFeature_RNA", pt.size = 0)
VlnPlot(larva, "RBPMS2B", pt.size = 0)
Investigate cluster markers thought to be contaminants.
# 17 - Amacrine cell (TFAP2B)
markers17 = FindMarkers(larva, ident.1 = 17, test.use = "MAST",
max.cells.per.ident = 1000)
# 21 - Amacrine cell (GAD1B, GAD2, SLC6A1B)
markers21 = FindMarkers(larva, ident.1 = 21, test.use = "MAST",
max.cells.per.ident = 1000)
# 22 - Photoreceptor (PDE6G, SAGB)
markers22 = FindMarkers(larva, ident.1 = 22, test.use = "MAST",
max.cells.per.ident = 1000)
# 23 - Photoreceptor (PDE6H, ARR3A, RBP4L)
markers23 = FindMarkers(larva, ident.1 = 23, test.use = "MAST",
max.cells.per.ident = 1000)
Remove contaminant cell types.
cells.remove = WhichCells(larva, idents = c(17, 21, 22, 23))
larva = SubsetData(larva, cells = setdiff(colnames(larva), cells.remove))
Since a large number of cells were removed, repeat clustering steps using the standard workflow.
larva <- NormalizeData(larva, normalization.method = "LogNormalize",
scale.factor = 10000)
larva <- FindVariableFeatures(larva, selection.method = "vst",
nfeatures = 2000)
larva <- ScaleData(larva, verbose = FALSE)
larva <- RunPCA(larva, npcs = 40, verbose = FALSE)
larva <- FindNeighbors(larva, dims = 1:40)
larva <- FindClusters(larva)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 12119
## Number of edges: 475929
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9124
## Number of communities: 31
## Elapsed time: 1 seconds
larva <- RunUMAP(larva, reduction = "pca", dims = 1:40)
larva <- RunTSNE(larva, reduction = "pca", dims = 1:40)
Identify all variable features.
lar.markers <- FindAllMarkers(larva, only.pos = TRUE, min.pct = 0.25,
logfc.threshold = 0.25)
lar.markers %>% group_by(cluster) %>% top_n(n = 1, wt = avg_logFC)
## # A tibble: 31 x 7
## # Groups: cluster [31]
## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 5.86e-253 1.20 0.838 0.381 7.06e-249 0 FAM60AL
## 2 0. 1.34 0.7 0.144 0. 1 ISL2A
## 3 0. 1.89 0.898 0.194 0. 2 MEIS2B
## 4 0. 2.96 0.557 0.03 0. 3 RDH10A
## 5 1.39e-176 1.42 0.523 0.143 1.68e-172 4 FOXP1B
## 6 3.28e-222 1.26 0.744 0.225 3.96e-218 5 KCNIP3B
## 7 0. 1.90 0.791 0.114 0. 6 EFHD1
## 8 0. 2.14 0.592 0.04 0. 7 NEUROD1
## 9 9.34e- 19 1.52 0.279 0.143 1.13e- 14 8 NPB
## 10 0. 2.00 0.6 0.075 0. 9 RPRMA
## # … with 21 more rows
Remove remaining any non valid clusters and set final cluster assignments.
# Cluster 5: Lacks of unique differentially expressed genes
head(subset(lar.markers, cluster == 5))
## p_val avg_logFC pct.1 pct.2 p_val_adj cluster
## KCNIP3B 3.283454e-222 1.2565768 0.744 0.225 3.958203e-218 5
## CALB2A 1.434951e-195 1.0222868 0.992 0.625 1.729834e-191 5
## SI:DKEYP-72G9.4 4.767464e-126 0.8113346 0.931 0.677 5.747178e-122 5
## BHLHE40 2.781606e-124 0.8927339 0.948 0.686 3.353226e-120 5
## MARCKSL1B 1.963600e-119 0.4111866 1.000 0.996 2.367120e-115 5
## CPLX2L 3.313262e-96 0.5549882 0.956 0.846 3.994137e-92 5
## gene
## KCNIP3B KCNIP3B
## CALB2A CALB2A
## SI:DKEYP-72G9.4 SI:DKEYP-72G9.4
## BHLHE40 BHLHE40
## MARCKSL1B MARCKSL1B
## CPLX2L CPLX2L
# Cluster 28: Muller Glia (RLBP1A, APOEB)
VlnPlot(larva, "RLBP1A", pt.size = 0)
# Remove clusters
Idents(larva) <- "RNA_snn_res.0.8"
cells.remove = WhichCells(larva, idents = c(5, 28))
larva = SubsetData(larva, cells = setdiff(colnames(larva), cells.remove))
## Warning: 'SubsetData' is deprecated.
## Use 'subset' instead.
## See help("Deprecated")
## Warning: 'OldWhichCells' is deprecated.
## Use 'WhichCells' instead.
## See help("Deprecated")
# Set final cluster assignments
larva@meta.data$clusterID = droplevels(Idents(larva))
levels(larva@meta.data$clusterID) = 1:length(levels(larva@meta.data$clusterID))
Idents(larva) <- "clusterID"
Construct dendrogram based on hierarchical clustering and reorder clusters.
# Build dendrogram
Idents(larva) <- "clusterID"
larva <- FindVariableFeatures(larva, selection.method = "vst",
nfeatures = 500)
larva <- BuildClusterTree(larva)
# Visualize the dendrogram
PlotClusterTree(larva)
plot(larva@tools$BuildClusterTree)
# Reorder clusters according to dendrogram for dotplot
# plotting
tree_obj = larva@tools$BuildClusterTree
left_clusts = Seurat:::GetLeftDescendants(tree_obj, length(levels(larva@meta.data$clusterID)) +
1)
right_clusts = Seurat:::GetRightDescendants(tree_obj, length(levels(larva@meta.data$clusterID)) +
1)
clust_order = c(left_clusts, right_clusts)
larva@meta.data$dendro_order = factor(larva@meta.data$clusterID,
levels = clust_order)
Idents(larva) <- "dendro_order"
Identify all variable features
lar.markers <- FindAllMarkers(larva, only.pos = TRUE, min.pct = 0.25,
logfc.threshold = 0.25)
Plot canonical RGC markers as well as markers that differentiate immature and mature clusters.
# Convert gene names to lowercase
larva <- LowerCase_genes(larva)
# Plot canonical RGC markers
DotPlot(larva, features = tolower(c("ROBO2", "ISL2B", "RBPMS2B"))) +
RotatedAxis()
# Plot immature and mature markers
DotPlot(larva, features = c("aldocb", "bhlhe41", "fam107b", "bhlhe40",
"tmsb", "alcamb", "tubb5", "fam60al")) + RotatedAxis()
Create Dot Plots for DE genes, globally expressed genes, and genes expressed in mature / immature clusters.
# Extract top 3 DE genes from each cluster for plotting
DEGenes_vector <- vector()
for (i in levels(Idents(larva))) {
DEGenes_vector = union(DEGenes_vector, head(subset(lar.markers,
cluster == i))[1:3, ]$gene)
}
DEGenes_vector = tolower(DEGenes_vector[2:length(DEGenes_vector)])
larva <- LowerCase_genes(larva)
# Plot DE genes
DotPlot(larva, features = DEGenes_vector) + RotatedAxis()
Subset the larval dataset into mature and immature cells based on cluster identity.
# Split larva into immature cells
immature_clusts = c("1", "2", "3", "5", "13", "20")
immature_cells <- WhichCells(larva, idents = immature_clusts)
# Create new Seurat object for immature cells. ClusterSeurat
# is a shortcut function that goes through the normal
# workflow.
immature <- CreateSeuratObject(Count.mat[, immature_cells])
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
immature <- ClusterSeurat(immature)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 4108
## Number of edges: 154957
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8698
## Number of communities: 10
## Elapsed time: 0 seconds
# Import cluster ID from full larva object
immature@meta.data$orig_ID <- 0
immature@meta.data[immature_cells, ]$orig_ID = as.numeric(larva@meta.data[immature_cells,
]$clusterID)
levels(immature@meta.data$orig_ID) <- as.numeric(immature_clusts)
Idents(immature) <- "orig_ID"
Create Seurat object for mature cells.
# Subset larva object
mature_clusts = setdiff(levels(larva@meta.data$clusterID), immature_clusts)
mature_cells <- WhichCells(larva, idents = mature_clusts)
# Create separate object
mature <- CreateSeuratObject(Count.mat[, mature_cells])
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
mature <- ClusterSeurat(mature)
## Centering and scaling data matrix
## PC_ 1
## Positive: FOSAB, BHLHE40, FAM107B, BHLHE41, SCG2B, BDNF, EGR4, PIM1, RGS16, NFIL3-6
## SI:DKEY-7J14.5, SI:DKEYP-72G9.4, JDP2B, SYBU, HSP70.2, NRGNA, AHCYL1, NPAS4A, SI:CH211-195B13.1, DNAJB5
## FNDC4A, HSP70.1, RGS8, ARG2, EFHD1, HSP70.3, RTN4RL2A, YWHAG1, SAMD4B, HSP70L
## Negative: FXYD6L, SI:DKEY-151G10.6, PPP1R14C, RPL3, RPLP2L, FAM60AL, STMN2B, TUBB5, RPL26, ACTB2
## RPS12, RPLP1, ACTB1, RPS2, RPS8A, RPL19, RPL7A, RPL37, RPL23, CIRBPB
## RPL36A, RPL11, RPS28, RPL22L1, TPT1, RPS19, RPS7, CSPG5A, RPL10A, RPL37A
## PC_ 2
## Positive: CALB2A, CALB2B, GNAO1B, NRGNA, POU4F1, CALM1A, TP53I11A, ZGC:162707, YWHAG1, ZGC:158291
## GNG13B, SI:DKEY-164F24.2, CABP5B, ELMO1, HMX4, HOMER3, RGS8, KCNIP3B, RTN4RL2A, ISL2B
## OLFM1B, SI:DKEY-7J14.5, ATF3, VSNL1A, EDIL3, NRN1B, SCRT1A, AHCYL1, SOX13, SUSD3
## Negative: ATP1B1B, EOMESA, ELAVL4, DUSP5, SYT1A, CRABP1A, GAS7A, ONECUT1, C1QTNF4, NMBB
## TSPAN18A, SI:DKEY-152P16.6, SLC16A3, CAMKVB, ITM2CB, PIM3, GAP43, PLCXD3, POU2F2A, NECAB1
## KCND3, CACNG7B, SI:CH211-11C15.3, CELF4, SLC6A1A, RDH10A, SI:CH211-222L21.1, P2RX3B, PLXNA4, PDLIM5B
## PC_ 3
## Positive: PCP4A, NELL2B, MAFAA, EFHD1, VAMP1, TUBB2, ZGC:65851, ATP1A3B, ZFHX3, MEF2CB
## STMN3, ELMO1, NAT8L, ATP5G1, NDUFA4, COX7B, SI:DKEY-7J14.5, ATF3, GNG13B, COX6C
## ARG2, SLC25A3B, COX4I1, ATP5G3B, CNTN1B, GRM6B, RTN4RL2A, USMG5, COX5AB, TMEM178
## Negative: EBF3A, KCNIP3B, MEIS2B, GRIN1B, ZGC:110340, PAX10, PHLDA3, BCL11BA, PRDM8B, BX957297.1
## SULF2B, CR381599.1, NRGNA, SATB2, FAM19A1A, NEUROD6A, FOXG1B, TFAP2D, PCDH10B, MAMDC2B
## POU4F3, LMO1, SOX11A, NEUROD6B, TP53I11B, SERTAD4, SYT9B, CAMKVB, SEMA3B, NEUROD2
## PC_ 4
## Positive: TSPAN18A, NR4A1, KCNIP3B, PLCXD3, C1QL3A, HSP70L, SI:DKEYP-72G9.4, NPB, HSP70.2, JUN
## SNCGA, FOSB, FOSAB, HSP70.1, HSP70.3, NMBA, APOF, MEIS2B, SI:CH211-195B15.8, TBR1B
## JDP2B, JUNBB, HSP90AA1.2, IER2, MCL1A, DBNDD2, CITED4A, GLULA, EGR4, FOSAA
## Negative: EOMESA, SOX6, ZGC:92658, CAMKVA, RDH10A, GRAPA, SI:DKEY-1H6.8, CRABP1A, UO:ION006, ADKA
## NMBB, FOXP1B, KCTD4, TMEM200B, C1QTNF4, CAMKVB, TP53I11A, FOXP2, PLSCR3B, CD82A
## ATP1A3B, RBFOX3, LRRTM1, NAT8L, PCDH18B, GRASP, PCP4A, NPTX1L, PCP4L1, IGSF11
## PC_ 5
## Positive: MEF2CB, ADCYAP1B, EFHD1, GRIA3A, NPTX1L, EOMESA, PCDH7B, ISL2B, RDH10A, SYBU
## NTRK3B, NMBB, RGS8, RGS16, KCNN3, CRABP1A, PCDH19, MEF2AA, OLFM1B, SEPT9B
## CLSTN2, CBLN4, CAMKVB, GPR78A, HOMER3, C1QTNF4, CBLN1, WLS, BTBD6A, GABRA2
## Negative: TSPAN18A, PLCXD3, C1QL3A, NPB, NMBA, CAMKVA, ZGC:92658, CFI, FOXP2, NTNG1A
## CDH8, SOX11B, FOXP1B, UO:ION006, ETV1, BARHL2, SI:CH211-195B13.1, GRAPA, PCK1, CAMK2N1A
## ATF3, RGS3A, ZGC:171482, EPB41L4A, NFIL3-6, SOX6, RIMS4, SI:CH211-67E16.11, CADM4, RPRMA
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 7298
## Number of edges: 290570
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9514
## Number of communities: 23
## Elapsed time: 0 seconds
## 21:02:17 UMAP embedding parameters a = 0.9922 b = 1.112
## 21:02:17 Read 7298 rows and found 30 numeric columns
## 21:02:17 Using Annoy for neighbor search, n_neighbors = 30
## 21:02:17 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 21:02:18 Writing NN index file to temp file /var/folders/lg/jzd8kwv13k18nl7j7q_drbkw0000gn/T//RtmpvkoHFU/file2edd5babcb05
## 21:02:18 Searching Annoy index using 1 thread, search_k = 3000
## 21:02:20 Annoy recall = 100%
## 21:02:20 Commencing smooth kNN distance calibration using 1 thread
## 21:02:20 Initializing from normalized Laplacian + noise
## 21:02:21 Commencing optimization for 500 epochs, with 305414 positive edges
## 21:02:29 Optimization finished
# Import cluster IDs from full larval object
mature@meta.data$orig_ID <- 0
mature@meta.data[mature_cells, ]$orig_ID <- as.numeric(larva@meta.data[mature_cells,
]$clusterID)
levels(mature@meta.data$orig_ID) <- as.numeric(mature_clusts)
Idents(mature) <- "orig_ID"
Plot tSNE reductions of immature and mature objects
DimPlot(immature, reduction = "tsne", group.by = "orig_ID")
DimPlot(mature, reduction = "tsne", group.by = "orig_ID")
Calculate silhoutette scores for each point in the immature and mature clusters in tSNE space. The median silhouette score is reported.
m.dist.matrix <- dist(x = Embeddings(mature, reduction = "tsne"))
m.clusters <- mature$orig_ID
m.sil <- silhouette(x = as.numeric(x = as.factor(x = m.clusters)),
dist = m.dist.matrix)
print(c("Mature silhouette score: ", median(m.sil[, 3])))
## [1] "Mature silhouette score: " "0.518967030781184"
i.dist.matrix <- dist(x = Embeddings(immature, reduction = "tsne"))
i.clusters <- immature$orig_ID
i.sil <- silhouette(x = as.numeric(x = as.factor(x = i.clusters)),
dist = i.dist.matrix)
print(c("Immature silhouette score: ", median(i.sil[, 3])))
## [1] "Immature silhouette score: " "0.416353648568413"